# Load API key and secret from environment variables
from dotenv import load_dotenv
load_dotenv()
# System libraries
import glob
# ML libraries
import pandas as pd
# ValidMind libraries
import validmind as vm
# Plotting libraries
Mortgage Loan Rates Forecast Model
Data Engineering
Import Libraries
Data Collection
Load FRED Data
def merge_fred_csv_files(file_pattern):
    """Merge every FRED CSV file matching a glob pattern into one DataFrame.

    Each file is read with its ``DATE`` column parsed as dates and used as
    the index; the resulting frames are concatenated column-wise, so rows
    are aligned on ``DATE``.

    Args:
        file_pattern: Glob pattern selecting the CSV files to load,
            e.g. ``'../datasets/fred/*.csv'``.

    Returns:
        A DataFrame indexed by ``DATE`` holding the columns of every
        matched file, in sorted-filename order.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """
    # glob returns matches in arbitrary, platform-dependent order; sorting
    # keeps the column order of the merged frame reproducible across runs.
    file_list = sorted(glob.glob(file_pattern))

    # Fail with an actionable message instead of letting pd.concat raise
    # its generic "No objects to concatenate" error on an empty list.
    if not file_list:
        raise ValueError(f"No CSV files match pattern: {file_pattern!r}")

    dataframes = [
        pd.read_csv(file, parse_dates=['DATE'], index_col='DATE')
        for file in file_list
    ]

    # axis=1 places each file's series side by side, aligned on the index.
    merged_df = pd.concat(dataframes, axis=1)
    return merged_df


file_path = '../datasets/fred/*.csv'
fred_df = merge_fred_csv_files(file_path)
display(fred_df)| GDPC1 | GS5 | GS10 | GS3 | MORTGAGE30US | UNRATE | CPIAUCSL | FEDFUNDS | GDP | |
|---|---|---|---|---|---|---|---|---|---|
| DATE | |||||||||
| 1947-01-01 | 2034.450 | NaN | NaN | NaN | NaN | NaN | 21.48 | NaN | 243.164 |
| 1947-02-01 | NaN | NaN | NaN | NaN | NaN | NaN | 21.62 | NaN | NaN |
| 1947-03-01 | NaN | NaN | NaN | NaN | NaN | NaN | 22.00 | NaN | NaN |
| 1947-04-01 | 2029.024 | NaN | NaN | NaN | NaN | NaN | 22.00 | NaN | 245.968 |
| 1947-05-01 | NaN | NaN | NaN | NaN | NaN | NaN | 21.95 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-04-01 | NaN | NaN | 3.46 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2023-04-06 | NaN | NaN | NaN | NaN | 6.28 | NaN | NaN | NaN | NaN |
| 2023-04-13 | NaN | NaN | NaN | NaN | 6.27 | NaN | NaN | NaN | NaN |
| 2023-04-20 | NaN | NaN | NaN | NaN | 6.39 | NaN | NaN | NaN | NaN |
| 2023-04-27 | NaN | NaN | NaN | NaN | 6.43 | NaN | NaN | NaN | NaN |
3551 rows × 9 columns
Preselection of Variables
# Narrow the merged FRED frame to the series used for modelling:
# one target series followed by the candidate feature series.
target_column = ['MORTGAGE30US']
feature_columns = ['UNRATE', 'GS10', 'FEDFUNDS']
# Column order in the result is target first, then features.
fred_df = fred_df.loc[:, [*target_column, *feature_columns]]
display(fred_df)| MORTGAGE30US | UNRATE | GS10 | FEDFUNDS | |
|---|---|---|---|---|
| DATE | ||||
| 1947-01-01 | NaN | NaN | NaN | NaN |
| 1947-02-01 | NaN | NaN | NaN | NaN |
| 1947-03-01 | NaN | NaN | NaN | NaN |
| 1947-04-01 | NaN | NaN | NaN | NaN |
| 1947-05-01 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... |
| 2023-04-01 | NaN | NaN | 3.46 | NaN |
| 2023-04-06 | 6.28 | NaN | NaN | NaN |
| 2023-04-13 | 6.27 | NaN | NaN | NaN |
| 2023-04-20 | 6.39 | NaN | NaN | NaN |
| 2023-04-27 | 6.43 | NaN | NaN | NaN |
3551 rows × 4 columns
ValidMind Setup
# Read the ValidMind credentials from the environment (populated by the
# load_dotenv() call at the top of the file) rather than hard-coding them,
# so secrets are never committed with the notebook.
# NOTE: the key/secret that used to be inlined here should be rotated.
import os

vm.init(
    # Host and project are not secret; fall back to the previous values
    # when the corresponding variables are not set.
    api_host=os.getenv("VM_API_HOST", "http://localhost:3000/api/v1/tracking"),
    # Indexing os.environ raises KeyError naming the missing variable, which
    # is clearer than an authentication failure from the API later on.
    api_key=os.environ["VM_API_KEY"],
    api_secret=os.environ["VM_API_SECRET"],
    project=os.getenv("VM_API_PROJECT", "clgo0g0rt0000fjy6ozl9pb69"),
)
True
# Bind df to the same DataFrame object as fred_df (plain assignment, no copy).
df = fred_df
# Wrap the DataFrame in a ValidMind dataset object; it is later passed to the test plan run.
vm_dataset = vm.init_dataset(dataset=df)Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Data Description
Data Quality
Frequency of the Series
Handling Frequencies
# Put every series on a common month-start ('MS') index; within each month
# .last() keeps the final entry per column (pandas skips NaN by default).
df = df.resample('MS').last()
# Rebuild the ValidMind dataset so it wraps the resampled frame.
vm_dataset = vm.init_dataset(dataset=df)Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Exploratory Data Analysis
Univariate Analysis
vm.test_plans.describe_plan("time_series_univariate")| Attribute | Value |
|---|---|
| ID | time_series_univariate |
| Name | TimeSeriesUnivariate |
| Description | Test plan to perform time series univariate analysis. |
| Required Context | ['dataset'] |
| Tests | TimeSeriesLinePlot (Metric), TimeSeriesHistogram (Metric), ACFandPACFPlot (Metric), SeasonalDecompose (Metric), AutoSeasonality (Metric), AutoStationarity (Metric), RollingStatsPlot (Metric), AutoAR (Metric), AutoMA (Metric) |
| Test Plans | [] |
# Per-test settings for the "time_series_univariate" test plan, keyed by test ID.
# The dict is passed as `config` to vm.run_test_plan on the closing line.
test_plan_config = {
# The three plotting tests below all receive the target plus every feature column.
"time_series_line_plot": {
"columns": target_column + feature_columns
},
"time_series_histogram": {
"columns": target_column + feature_columns
},
"acf_pacf_plot": {
"columns": target_column + feature_columns
},
# Upper bound on the AR order (presumably the highest order the search tries; confirm in the ValidMind docs).
"auto_ar": {
"max_ar_order": 3
},
# Upper bound on the MA order (same caveat as auto_ar).
"auto_ma": {
"max_ma_order": 3
},
# Additive decomposition model; fig_size is presumably the figure (width, height). TODO confirm units.
"seasonal_decompose": {
"seasonal_model": 'additive',
"fig_size": (40,30)
},
# Range of candidate seasonal periods, 1 to 3 (presumably in observations; confirm).
"auto_seasonality": {
"min_period": 1,
"max_period": 3
},
# max_order is presumably the highest differencing order tested and threshold the p-value cut-off; confirm.
"auto_stationarity": {
"max_order": 3,
"threshold": 0.05
},
# 12-observation rolling window (one year if the series is monthly).
"rolling_stats_plot": {
"window_size": 12
},
}vm.run_test_plan("time_series_univariate", config=test_plan_config, dataset=vm_dataset)Running Metric: acf_pacf_plot: 22%|██▏ | 2/9 [00:00<00:01, 3.58it/s] The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
Running Metric: seasonal_decompose: 33%|███▎ | 3/9 [00:01<00:02, 2.37it/s]The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
Running Metric: auto_ma: 89%|████████▉ | 8/9 [00:04<00:00, 2.48it/s]
Warning: MORTGAGE30US is not stationary. Results may be inaccurate.
Warning: GS10 is not stationary. Results may be inaccurate.
Warning: MORTGAGE30US is not stationary. Results may be inaccurate.
Non-invertible starting MA parameters found. Using zeros as starting parameters.
Non-invertible starting MA parameters found. Using zeros as starting parameters.
Warning: GS10 is not stationary. Results may be inaccurate.
Non-invertible starting MA parameters found. Using zeros as starting parameters.
Non-invertible starting MA parameters found. Using zeros as starting parameters.
Results for Time Series Univariate Test Plan:
This test plan provides a preliminary understanding of the target variable(s)
used in the time series dataset. It provides visualizations that present the raw time
series data and a histogram of the target variable(s).
The raw time series data provides a visual inspection of the target variable's
behavior over time. This helps to identify any patterns or trends in the data,
as well as any potential outliers or anomalies. The histogram of the target
variable displays the distribution of values, providing insight into the range
and frequency of values observed in the data.